
if(T == T){ 
rm(list=ls())
analysis_date       = "2019-8-5"
use_RCE             = F
numericize_text     = F
compute_readme2     = T
compute_splits      = F
compute_classifiers = T
options(repos=structure(c(CRAN='http://cran.cnr.berkeley.edu/')),  INSTALL_opts = c('--no-lock' ))

try(devtools::install_github("iqss-research/readme-software/readme",force = FALSE, quiet = FALSE),T) 
library(readme)
mainDir <- "~/Downloads/FullReplicationMaterials" 
setwd( mainDir )
  
iterations <- 1; nboot <- 1
global_iter_seq <- 1:73; global_iter_seq[1:69] <- sample(1:69, 69)
labeled_sz_ <- 300; unlabeled_sz_ <- 300

my_seed = try(c(random::randomNumbers(n=1, min = 1,
                               max = 100000000,
                               col = 1 )), T) 
if(class(my_seed) == "try-error"){ 
my_seed = c(sum(as.logical(rawToBits(charToRaw(paste(Sys.time(),Sys.getenv(),Sys.getpid(), collapse = " "))))))
}
set.seed(my_seed)

sampling_scheme <- "BalancedCombination"

eval_discrete <- 'try( DiscreteTest(
dtm=as.matrix(data.table::fread(cmd = dtm_pastein )[,-1])[,colMeans(data.table::fread(cmd = dtm_pastein )[,-1])>0.01], 
labeledIndicator=iter_labeledIndicator, 
categoryVec=iter_categoryVec,
compareQuantifiers = T), T)'

eval_continuous <- 'try( ContinuousTest(
dfm=as.matrix(data.table::fread(cmd = dfm_pastein )[,-1]), 
labeledIndicator=iter_labeledIndicator, 
categoryVec=iter_categoryVec, 
nboot = nboot), T)'

### List all of the Crimson Hexagon CSVs
support_files_numericize <- c("readme2_numericize_helper_fxns.R")
support_files_samp <- c("readme2_sampling.R","readme2_sampling_helper_fxns.R")
support_files_class <- c("readme2_testing.R","readme2_testing_helper_fxns.R")
filename_saved_data = "PrecomputedMaterial"
filename_saved_results = sprintf("Results_%s_%s", sampling_scheme , analysis_date )
filename_support = "Readme2_SupportCode"

if( !dir.exists(sprintf("%s/%s", mainDir, filename_saved_results))){ dir.create(sprintf("%s/%s", mainDir, filename_saved_results))}

counter_ <- 0 
for(ijack in global_iter_seq){
  counter_ <- counter_ + 1 
  
  print(counter_)
  ### Initialize placeholder for storing the errors
  errors <- data.frame()
  
  if(ijack < 70){
    listed_files = list.files('./')
    dtm_loc=sort(listed_files[grepl(listed_files,pattern="DTM_") & !grepl(listed_files,pattern=" ")])[ijack]
    dfm_loc=sort(listed_files[grepl(listed_files,pattern="DFM_") ])[ijack]
    cats_loc=sort(listed_files[grepl(listed_files,pattern="CATS_") ])[ijack]
    splits_loc=sort(listed_files[grepl(listed_files,pattern="SPLITS_") ])[ijack]
    load( file = splits_loc)
    out_file <- sprintf("results_%s.csv",ijack)
  }
  if(ijack ==  70){csv_name <- "enron"; out_file <- "enron_results.csv"}
  if(ijack ==  71){csv_name <- "clinton"; out_file <- "clinton_results.csv"}
  if(ijack ==  72){csv_name <- "immigration";out_file <- "immigration_results.csv"}
  if(ijack ==  73){csv_name <- "stanford";out_file <- "stanford_results.csv"}
  if(ijack >= 70){
    dtm_loc  = paste("./", "/DTM_", csv_name, sep = "")
    dfm_loc  = paste("./", "/DFM_", csv_name, sep = "")
    cats_loc = paste("./", "/CATS_", csv_name, sep = "")
   load( file = sprintf("./SPLITS_%s.RData" ,
                                      gsub(csv_name, pattern = ".csv", replace = "")))
  }
  
  if(numericize_text == F){ 
      corpus_categoryVec <- try(c(as.character(read.csv(file = cats_loc)[,-1])), T)
  }

  if(numericize_text == T){ 
      numericize_fxns_added <- ls(); for(support_file in support_files_numericize){source(sprintf("./%s/%s",filename_support, support_file)) }
      numericize_fxns_added <- ls()[!ls() %in% numericize_fxns_added]
      
      #Part 1 - read in the data 
      { 
      if(ijack == length(csv_names) + 1){ 
        csv_name <- "enron"; out_file <- "enron_results.csv" 
        csv_input <- read.table("./validationdata/enrontexts/controlenron.txt", sep = ",", header = T)
        time_stamp <- read.csv("./validationdata/enron_date_time_data.csv", stringsAsFactors=F)
        enron_files <- list.files("./validationdata/enrontexts/")
        csv_input <- cbind(csv_input, sapply(as.character(csv_input[,1]), function(x){paste(readLines(x), collapse = " ")}))

        ### Load time-stamps dataset
        csv_input[,1] <- gsub(x = csv_input[,1], pattern = "validationdata/enrontexts/",  replace = "")
        time_stamp$path <- gsub(time_stamp$path,  pattern = "validationdata/enrontexts/",  replace = "")
        dates_mat = as.data.frame( matrix(NA, nrow = nrow(time_stamp), ncol =0 ))  
        dates_mat$year_XXX <- time_stamp$year_XXX[base::match(time_stamp$path,csv_input[,1])]
        dates_mat$month_XXX <- time_stamp$month_XXX[base::match(time_stamp$path,csv_input[,1])]
        dates_mat$day_XXX <- time_stamp$day_XXX[base::match(time_stamp$path,csv_input[,1])]
        enron_ordering <- order(dates_mat$year_XXX, dates_mat$month_XXX, dates_mat$day_XXX) 
        
        csv_input = csv_input[enron_ordering,]
        my_text <- cleanme(as.character( csv_input[,4] )  )
        corpus_DTM <- toDTM(my_text)
        corpus_categoryVec = csv_input[,2]
        
        rm(enron_ordering);rm(dates_mat); rm(csv_input)
      } 
      
      if(ijack == length(csv_names) + 2){
        csv_input <- read.table("./validationdata/clintonposts/control.txt", sep = ",", header = T)
        csv_input <- cbind(csv_input,sapply(as.character(csv_input[,1]), function(x){paste(readLines(x), collapse = " ")}))
        csv_input = csv_input[csv_input[,2]!=0,]
        csv_input[,4] = iconv(csv_input[,4], from = "UTF-8", to = "ASCII", sub = '', mark = T)
        csv_input <- csv_input[sapply(csv_input[,4], nchar)>200,]#remove outliers 
        csv_input <- csv_input[sapply(csv_input[,4], nchar)<10000,]#remove outliers 
        
        corpus_DTM <- toDTM(csv_input[,4])
        corpus_categoryVec <- csv_input[,2]
        my_text <- cleanme(csv_input[,4])

        rm(csv_input);
      } 
      
      if(ijack == length(csv_names) + 3){ 
        csv_input <- read.table("./validationdata/immigtexts/controlimmig.txt", sep = ",", header = T)
        csv_input <- cbind(csv_input,tolower(  sapply(as.character(csv_input[,1]), function(x){enc2utf8(paste(readLines(x), collapse = " ") )}) ))
        
        ### sort immigration_undergrad by 
        date_vec <- read.table( "validationdata/immigtexts/controlimmig.txt", sep = ",", header = T)[,1]
        date_vec <- gsub("^.*([[:digit:]]{8}).*$","\\1",as.character(date_vec))
        ## Fix non-matched element
        date_vec[grep("immigtexts",date_vec)] <- "25062007"
        
        ## Parse date_vec as a date object
        immigration_dates <- strptime(date_vec, "%d%m%Y")
        
        ## Reorder thing
        csv_input = csv_input[order(immigration_dates),]
        my_text <- cleanme(as.character(csv_input[,ncol(csv_input)]))
        corpus_DTM <- toDTM(my_text)
        corpus_categoryVec <- csv_input$truth
        rm(csv_input)
      }
      
      if(ijack == length(csv_names) + 4){
        Dictionary <- do.call(rbind, strsplit(readLines("./Quantification Replication/stanfordSentimentTreebank/dictionary.txt", 
                                                        encoding = "UTF-8"), split = "\\|") )
        Dictionary[,2] <- iconv(Dictionary[,2], from="UTF-8", to="LATIN1")
        DataSentences <- do.call(rbind, strsplit(readLines("./Quantification Replication/stanfordSentimentTreebank/datasetSentences.txt", 
                                                           encoding = "macintosh"), split = "\\t") )[-1,]
        DataSentences[,2] <- iconv(DataSentences[,2], from="UTF-8", to="LATIN1")
        DataSentences[,2] <- gsub( DataSentences[,2], pattern = "-LRB-", replace = "(") 
        DataSentences[,2] <- gsub( DataSentences[,2], pattern = "-RRB-", replace = ")")
        DataSplits <- do.call(rbind, strsplit(readLines("./Quantification Replication/stanfordSentimentTreebank/datasetSplit.txt", encoding = "UTF-8"), 
                                              split = ",") )[-1,]
        SentimentLabels <- do.call(rbind, strsplit(readLines("./Quantification Replication/stanfordSentimentTreebank/sentiment_labels.txt", encoding = "UTF-8"), 
                                                   split = "\\|"))[-1,]
        DictTextVec <- Dictionary[,1]
        DictIDVec <- f2n(Dictionary[,2])
        PhraseLabelsIDVec <- f2n(SentimentLabels[,1])
        PhraseLabelsValueVec <- f2n(  SentimentLabels[,2] )  
        SentenceTextVec <- DataSentences[,2]
        MasterSentencesDf <- cbind(DataSentences, matrix(logical(), ncol = 4, nrow = length(SentenceTextVec)))
        SentencePhraseIDVec <- SentenceSentimentCatVec <- SentenceSentimentValueVec <- rep(NA, times = length(SentenceTextVec))
        colnames(MasterSentencesDf) <- c("SentenceID", "Sentence", "SentenceIDAsPhrase", 
                                         "SentimentValue", "SentimentLabel", "FromSet")
        bad_indices <- c()
        for(badger in 1:length(SentenceTextVec)){
          SentPhraseID <- DictIDVec[which(DictTextVec == SentenceTextVec[badger])]
          if(length(SentPhraseID) == 0){ bad_indices <- c(badger, bad_indices) } 
          if(length(SentPhraseID) > 0){ 
            SentencePhraseIDVec[badger] <- SentPhraseID
            SentenceSentimentValueVec[badger] <- PhraseLabelsValueVec[which(PhraseLabelsIDVec == SentPhraseID)]
          } 
        }
        
        #[0, 0.2], (0.2, 0.4], (0.4, 0.6], (0.6, 0.8], (0.8, 1.0]
        SentenceSentimentCatVec[SentenceSentimentValueVec <=0.20] <- 1
        SentenceSentimentCatVec[SentenceSentimentValueVec > 0.20 & SentenceSentimentValueVec <= 0.40] <- 2
        SentenceSentimentCatVec[SentenceSentimentValueVec > 0.40 & SentenceSentimentValueVec <= 0.60] <- 3
        SentenceSentimentCatVec[SentenceSentimentValueVec > 0.60 & SentenceSentimentValueVec <= 0.80] <- 4
        SentenceSentimentCatVec[SentenceSentimentValueVec > 0.80] <- 5
        
        MasterSentencesDf[,"SentenceIDAsPhrase"] <- SentencePhraseIDVec
        MasterSentencesDf[,"SentimentValue"] <- SentenceSentimentValueVec
        MasterSentencesDf[,"SentimentLabel"] <- SentenceSentimentCatVec
        MasterSentencesDf[,"FromSet"] <- DataSplits[,2]
        
        MasterSentencesDf <- MasterSentencesDf[!is.na(MasterSentencesDf[,"SentimentLabel"]),]
        my_text <-  cleanme( MasterSentencesDf[,"Sentence"] ) 
        corpus_DTM <- toDTM(my_text)
        colnames(corpus_DTM)[-c(1)] <- paste("V", 1:(ncol(corpus_DTM)-1), sep = "")
        
        corpus_categoryVec = SentenceSentimentCatVec
        corpus_DTM[,c(1:ncol(corpus_DTM))] <- apply(corpus_DTM[,c(1:ncol(corpus_DTM))], 2, f2n)
        rm(SentenceSentimentCatVec);rm(SentimentLabels);rm(MasterSentencesDf);
        rm(PhraseLabelsIDVec);rm(PhraseLabelsValueVec);rm(Dictionary)
        rm(DictTextVec);rm(DataSentences);rm(SentenceTextVec);rm(DictIDVec);
        rm(DataSplits)
      } 
      
      if(ijack <= length(csv_names)){ 
        ### For each CH CSV, Load CSV file
        csv_name <- csv_names[ijack]
        cat(paste(csv_name, "\n"))
        csv_csvfile <- try(read.csv(paste("./validationdata/HandcodedCSVs/",csv_name, sep=""), sep=",", stringsAsFactors=F, encoding="latin1"), T)
        
        ### Sort CSV file by Timestamp
        csv_csvfile$Timestamp <- try( strptime(csv_csvfile$Timestamp, format="%m/%e/%y %H:%M"), T)
        csv_csvfile <- try( csv_csvfile[order(csv_csvfile$Timestamp),], T)
        
        my_text <- cleanme(csv_csvfile$Contents)
        corpus_DTM <- toDTM(my_text)
        corpus_categoryVec <- csv_csvfile$Category
        rm(csv_csvfile)
      }
      }
        
      #Part 2 - process it 
      { 
        if(!"wordVecs_corpus" %in% ls()){ 
        wordVecs_corpus <- data.table::fread(sprintf("./%s/glove.twitter.27B.200d.txt", "glove.twitter.27B"))
        wordVecs_rowNames <- wordVecs_corpus[[1]]
        wordVecs_corpus <- as.matrix (  wordVecs_corpus[,-1] )  
        row.names(wordVecs_corpus) <- wordVecs_rowNames
        rm(wordVecs_rowNames)
        }
        
        docSummaries <- FastScale( undergrad(my_text,  wordVecs = wordVecs_corpus,
                                             word_quantiles = c(0.50)))
        rm(my_text);
        if(use_RCE == T){rm(wordVecs_corpus)}

        ## Make "CATEGORY" coding a factor variable
        corpus_categoryVec <- gsub(as.character(corpus_categoryVec), pattern = "[[:punct:]]", replace = "_")
        corpus_categoryVec <- gsub(corpus_categoryVec, pattern = "[[:space:]]", replace = "_")
        corpus_categoryVec <- as.factor(corpus_categoryVec)
        
        #save for precomputations 
        write.csv(corpus_DTM, file = paste("./", filename_saved_data, "/DTM_", csv_name, sep = ""))
        write.csv(docSummaries, file = paste("./", filename_saved_data, "/DFM_", csv_name, sep = ""))
        write.csv(as.matrix(corpus_categoryVec), file = paste("./", filename_saved_data, "/CATS_", csv_name, sep = ""))
        
        eval(parse(text = sprintf("rm(%s)", paste( numericize_fxns_added, collapse = ","))))
        rm(corpus_DTM,docSummaries); 
      }
    } 

  compute_classifiers_ = compute_classifiers
  compute_splits_ = compute_splits 
  if(compute_splits == T & compute_classifiers == F){ 
    #If we allowed this, then errors would no longer correspond to the same data split. 
    stop("INCONSISTENT SIMULATION SPECIFICATIONS")  
  }
  if(compute_splits == F){ 
    try_splits = try(load(sprintf(paste("./", filename_saved_data, "/SPLITS_%s.RData",sep = ""),
                     gsub(csv_name, pattern = ".csv", replace = ""))), T)
    if(class(try_splits) == "try-error"){ 
      compute_splits_ <- T
    }
    if(class(try_splits) != "try-error"){ 
      if(length(indices_list) == iterations){ 
        compute_splits_ <- F
      }
      if(length(indices_list) != iterations){ 
        compute_classifiers_ <- compute_splits_ <- T 
      }
    }
  }
  
  if(compute_splits == T){
    samp_fxns_added <- ls(); for(support_file in support_files_samp){source(sprintf("./%s/%s",filename_support, support_file)) }
    samp_fxns_added <- ls()[!ls() %in% samp_fxns_added]
    csv_error <- rep(NA, length=iterations)
    sampling_scheme_used <- rep(NA, times = iterations)
    if(sampling_scheme != "BalancedCombination"){ProjectionsMat = NULL }
    if(sampling_scheme == "BalancedCombination"){
      iter_labeledIndicator = c(rep(1,times = 300),rep(0,times=length(corpus_categoryVec)-300))
      selected_indices = 1:length(corpus_categoryVec)
      selected_indices = selected_indices[corpus_categoryVec[selected_indices] %in% 
                                            unique(corpus_categoryVec[selected_indices[iter_labeledIndicator==1]])]
      iter_labeledIndicator = iter_labeledIndicator[selected_indices]
      iter_corpus_categoryVec = as.character( corpus_categoryVec[selected_indices] )  
      dfm_pastein_labeled  = paste0("sed -n '" , paste0(  c(1,selected_indices[iter_labeledIndicator==1] + 1) , 
                                                          collapse = "p\n " ) ,"p' ",sprintf("'%s'",dfm_loc), collapse = "" )
      dfm_pastein_unlabeled  = paste0("sed -n '" , paste0(  c(1,selected_indices[iter_labeledIndicator==0] + 1) , 
                                                            collapse = "p\n " ) ,"p' ",sprintf("'%s'",dfm_loc), collapse = "" )
      ProjectionsMat <- readme(dfm =  list(labeled_cmd   = dfm_pastein_labeled,
                                                unlabeled_cmd = dfm_pastein_unlabeled),
                               labeledIndicator = iter_labeledIndicator, 
                               sgdIters      = 500,categoryVec = iter_corpus_categoryVec, 
                               nBoot = 1, justTransform = T)$transformed_dfm
    }
    indices_list <- list()
    for(it in 1:iterations){ 
        cat(paste("Generating Dataset :", it, "\n"))
        sampling_scheme_old <- sampling_scheme
        if(sampling_scheme == "BalancedCombination"){
          if(it == 1){ 
            sampling_scheme_vec <- c( "Max_XDiv", "Min_XDiv","Uniform_XDiv" , 
                     "Historical","Sequential", "HistoricalMaxU", "HistoricalMaxL","HistoricalVarySampSize",
                     "Ahistorical_NoReuse","firat2018",
                     "breakdown_sampling","breakdown_sampling2", 
                     "JustPermutations", "MinDiv", "MaxDiv",
                     "Ahistorical_NoReuse2","Ahistorical_aykut","Ahistorical_aykut_quantification")
            sampling_scheme_vec <- sample(sampling_scheme_vec)
            sampling_scheme_vec <- rep(sampling_scheme_vec, times = ceiling( iterations/length(sampling_scheme_vec)) ) 
          }
          sampling_scheme <- sampling_scheme_vec[it]
        }
        labeled_sz_checked = labeled_sz_; unlabeled_sz_checked = unlabeled_sz_
        target_sz = labeled_sz_+unlabeled_sz_
        if(target_sz >= length(corpus_categoryVec)){ 
          frac_seq = seq(0.01, 0.99, length.out = 100)
          cand_sz = sapply(frac_seq, 
                           function(xa){ xa  * target_sz})
          labeled_sz_checked = round(labeled_sz_ * frac_seq[which.min(abs(cand_sz - length(corpus_categoryVec) + 3))][1])
          unlabeled_sz_checked = round(unlabeled_sz_ * frac_seq[which.min(abs(cand_sz - length(corpus_categoryVec) + 3))][1])
        }
        while_ok <- F; while_ok_counter = 0
        while(while_ok == F){ 
          if(while_ok_counter > 0){print(sprintf("BAD: %s", while_ok_counter))}
          while_ok_counter = while_ok_counter + 1
          INDICES_LIST = try(GetLabeledUnlabeledIndices( csv_category = as.factor(corpus_categoryVec),
                                                       sampling_scheme = sampling_scheme, 
                                                       labeled_sz = labeled_sz_checked, unlabeled_sz = unlabeled_sz_checked,
                                                       ProjectionsMat_input = ProjectionsMat ), T)  
          if(length(unique(corpus_categoryVec[INDICES_LIST[[1]]])) > 1){while_ok = T}
          if(while_ok_counter > 10){while_ok = F; INDICES_LIST <- try("a"+ 2, T)}
          
        } 
        if(class(INDICES_LIST) == "try-error"){ 
          if(use_RCE == F){print("SAMPLING ERROR, DEFAULTING TO HISTORICAL");browser()}
          INDICES_LIST = try(GetLabeledUnlabeledIndices( csv_category = corpus_categoryVec,
                                                         sampling_scheme = "Historical", 
                                                         labeled_sz = labeled_sz_checked, unlabeled_sz = unlabeled_sz_checked, 
                                                         ProjectionsMat_input = ProjectionsMat), T)  
        }
        labeled_indices <- INDICES_LIST$labeled_indices
        unlabeled_indices <- INDICES_LIST$unlabeled_indices
        LC <- corpus_categoryVec[INDICES_LIST$labeled_indices]
        UC <- corpus_categoryVec[INDICES_LIST$unlabeled_indices]
        LC_CATS_KEEP <- table( LC); LC_CATS_KEEP <- names(LC_CATS_KEEP)[LC_CATS_KEEP>10]
        unlabeled_indices <- unlabeled_indices[UC %in% LC_CATS_KEEP]
        labeled_indices <- labeled_indices[LC %in% LC_CATS_KEEP]
        indices_list[[it]] <- list(labeled_indices = labeled_indices, 
                                   unlabeled_indices = unlabeled_indices)
        sampling_scheme_used[it] <- sampling_scheme
        sampling_scheme <- sampling_scheme_old
    }
    indices_list <- indices_list[unlist(lapply(indices_list, function(as){ 
      x_ = try(min(length(as[[1]]), length(as[[2]])), T)
      if(class(x_) == "try-error"){x_ <- 0}
      return( x_ ) })) > 0]
    names(indices_list) <- sampling_scheme_used
    save(indices_list, file = sprintf("./%s/SPLITS_%s.RData",filename_saved_data ,
                                      gsub(csv_name, pattern = ".csv", replace = "")))
    eval(parse(text = sprintf("rm(%s)", paste(samp_fxns_added, collapse = ","))))
    rm(ProjectionsMat,indices_list)
  } 
  
  #first do classifier + quantifier analysis 
  if(compute_classifiers == F){ 
      test_ = try(sum(is.na(as.data.frame(data.table::fread(sprintf("./%s/%s", filename_saved_results, out_file) ))[,-1]["continuousSVM_error"] )) > 0, T) 
      if(class(test_) == "try-error"){ compute_classifiers_ <- T } 
      if(class(test_) != "try-error"){ 
        try( if(test_ == 0){  compute_classifiers_ <- F } , T)  
        if(test_ != 0){  compute_classifiers_ <- T }
      }
  } 
  
  if(compute_classifiers_ == T){ 
    class_fxns_added <- ls(); for(support_file in support_files_class){source(sprintf("./%s/%s", filename_support,support_file)) }
    class_fxns_added <- ls()[!ls() %in% class_fxns_added]
    
    for (it in 1:iterations){
      print( it )  
      labeled_indices <- indices_list[[it]]$labeled_indices
      unlabeled_indices <- indices_list[[it]]$unlabeled_indices
      
      selected_indices = sort(c(labeled_indices, unlabeled_indices))
      iter_labeledIndicator <- rep(0,times = length(corpus_categoryVec))
      iter_labeledIndicator[labeled_indices] <- 1
      iter_labeledIndicator <- iter_labeledIndicator[selected_indices]
      iter_categoryVec = as.character( corpus_categoryVec[selected_indices]  )
      
      #################### Run the relevant code ####################### 
      dtm_pastein = paste0("sed -n '" , paste0( c(1,selected_indices+1),collapse = "p\n " ) ,
                           "p' ",dtm_loc, collapse = "" )
      dfm_pastein  = paste0("sed -n '" , paste0(  c(1,selected_indices + 1) , 
                                                  collapse = "p\n " ) ,
                            "p' ",sprintf("'%s'",dfm_loc), collapse = "" )
      print("Starting Classifier + Quantifier Analysis")
      
      discreteResults <- try( eval(parse(text = eval_discrete ) ), T)
      continuousResults <- try( eval(parse(text = eval_continuous ) ), T)
      outer_start_time=proc.time()[3]
      if(class(discreteResults) == "try-error"){print(discreteResults)}
      if(class(continuousResults) == "try-error"){print(continuousResults)}
      
      #################### Save estimates ##############################
      new_error <-  try(data.frame(dataset=paste(ijack, " - Analysis",sep=""), 
                                   logbits_inmemory = log(sum(sort( sapply(ls(),function(x){object.size(get(x))})))),
                                   logbits_inmemory2 = NA,
                                   elapsed_time = NA,
                                   initial_seed = c(my_seed), 
                                   
                                   readme2_error= NA,
                                   readme2_1_error= NA,  
                                   readme2_2_error= NA,  
                                   readme2_3_error= NA,  
                                   
                                   readme_error = discreteResults$error_readme , 
                                   
                                   continuousRegression_error = continuousResults$ErrorResultsEnsemble_continuous$Regression_est,
                                   discreteRegression_error = discreteResults$ErrorResultsEnsemble_discrete$Regression_est,
                                   
                                   #continuous comparisons 
                                   continuousEnsemble_error = continuousResults$ErrorResultsEnsemble_continuous$ensemble,
                                   continuousBayes_error = continuousResults$ErrorResultsEnsemble_continuous$NaiveBayes_est,
                                   continuousSVM_error = continuousResults$ErrorResultsEnsemble_continuous$SVM_est,
                                   continuousForest_error = continuousResults$ErrorResultsEnsemble_continuous$Forest_est,
                                   
                                   #continuous pure count comparisons 
                                   continuousRegression_error_count = continuousResults$ErrorResultsEnsemble_continuous_count$Regression_est_count,
                                   continuousEnsemble_error_count = continuousResults$ErrorResultsEnsemble_continuous_count$ensemble_count,
                                   continuousBayes_error_count = continuousResults$ErrorResultsEnsemble_continuous_count$NaiveBayes_est_count,
                                   continuousSVM_error_count = continuousResults$ErrorResultsEnsemble_continuous_count$SVM_est_count,
                                   continuousForest_error_count = continuousResults$ErrorResultsEnsemble_continuous_count$Forest_est_count,
                                   
                                   #discrete comparisons 
                                   discreteEnsemble_error = discreteResults$ErrorResultsEnsemble_discrete$ensemble,
                                   discreteBayes_error = discreteResults$ErrorResultsEnsemble_discrete$NaiveBayes_est,
                                   discreteSVM_error = discreteResults$ErrorResultsEnsemble_discrete$SVM_est,
                                   discreteForest_error = discreteResults$ErrorResultsEnsemble_discrete$Forest_est,
                                   
                                   #discrete pure count comparisons 
                                   discreteRegression_error_count = discreteResults$ErrorResultsEnsemble_discrete_count$Regression_est_count,
                                   discreteEnsemble_error_count = discreteResults$ErrorResultsEnsemble_discrete_count$ensemble_count,
                                   discreteBayes_error_count = discreteResults$ErrorResultsEnsemble_discrete_count$NaiveBayes_est_count,
                                   discreteSVM_error_count = discreteResults$ErrorResultsEnsemble_discrete_count$SVM_est_count,
                                   discreteForest_error_count = discreteResults$ErrorResultsEnsemble_discrete_count$Forest_est_count,
                                   
                                   #quantification error metrics 
                                   va2_error  = discreteResults$va2_error,
                                   quantify_friedman_error = discreteResults$error_quantify[[1]]["friedman"], 
                                   quantify_prob_error = discreteResults$error_quantify[[1]]["prob"], 
                                   quantify_mixtureL2_error = discreteResults$error_quantify[[1]]["mixtureL2"], 
                                   quantify_mixtureL1_error = discreteResults$error_quantify[[1]]["mixtureL1_QUANT"], 
                                   quantify_mixtureHPMF_error = discreteResults$error_quantify[[1]]["mixtureHPMF"], 
                                   quantify_adjCount_error = discreteResults$error_quantify[[1]]["adjCount"], 
                                   quantify_medianSweep_error = discreteResults$error_quantify[[1]]["medianSweep"], 
                                   quantify_hdx_error = discreteResults$error_quantify[[1]]["hdx"], 
                                   quantify_va1_error = discreteResults$error_quantify[[1]]["va1"], 
                                   quantify_naive_error = discreteResults$error_quantify[[1]]["naive"], 
                                   
                                   PrDDiv_withMatching= NA, PrDDiv = NA, 
                                   ESGivenDDiv_withMatching = NA, ESGivenDDiv = NA, 
                                   nCat = length( unique(corpus_categoryVec[ labeled_indices ] ) ), 
                                   iteration = it, corpus_numb = counter_, nboot = nboot, 
                                   n_labeled = sum(iter_labeledIndicator==1),  n_unlabeled = sum(iter_labeledIndicator==0), 
                                   sampling_scheme = names(indices_list)[it]), T)
      row.names(new_error) <- NULL ; errors <- rbind(errors,new_error);rm(new_error)
      write.csv(errors, sprintf("./%s/%s", filename_saved_results, out_file) ) 
      
    }
    eval(parse(text = sprintf("rm(%s)", paste(class_fxns_added, collapse = ","))))
    rm(errors); 
  } 
  
  if(compute_readme2 == T){ 
    for (it in 1:iterations){
        print(it)
        labeled_indices <- indices_list[[it]]$labeled_indices
        unlabeled_indices <- indices_list[[it]]$unlabeled_indices
        selected_indices = sort(c(labeled_indices, unlabeled_indices))
        iter_labeledIndicator <- rep(0,times = length(corpus_categoryVec))
        iter_labeledIndicator[labeled_indices] <- 1
        iter_labeledIndicator <- iter_labeledIndicator[selected_indices]
        iter_categoryVec = as.character( corpus_categoryVec[selected_indices]  )

        dfm_pastein_labeled  = paste0("sed -n '" , paste0(  c(1,selected_indices[iter_labeledIndicator==1] + 1) , 
                                                            collapse = "p\n " ) ,"p' ",sprintf("'%s'",dfm_loc), collapse = "" )
        dfm_pastein_unlabeled  = paste0("sed -n '" , paste0(  c(1,selected_indices[iter_labeledIndicator==0] + 1) , 
                                                              collapse = "p\n " ) ,"p' ",sprintf("'%s'",dfm_loc), collapse = "" )
        outer_start_time <- proc.time()[3]
        unlabeled_pd  = c(prop.table(table(as.factor(iter_categoryVec)[iter_labeledIndicator==0])))
        labeled_pd  = c(prop.table(table(as.factor(iter_categoryVec)[iter_labeledIndicator==1])))
        
        readme2Results <- try(readme(dfm = list(labeled_cmd   = dfm_pastein_labeled,
                                                 unlabeled_cmd = dfm_pastein_unlabeled),
                                 labeledIndicator = iter_labeledIndicator, nbootMatch = 50, kMatch=3,sgdIters   = 1000,
                                 categoryVec = iter_categoryVec, nBoot = nboot),T)

        if(class(readme2Results) == "try-error"){print( readme2Results )}
         tf$keras$backend$clear_session(); tf$reset_default_graph()
         mem_at_iter = pryr::mem_used()
         print(mem_at_iter)
         readme2Results = list(error_readme2 = sum(abs(readme2Results$point_readme[names(unlabeled_pd)] - unlabeled_pd)),
                               error_readme2_1 = sum(abs(readme2Results$point_readme_1[names(unlabeled_pd)] - unlabeled_pd)),
                               error_readme2_2 = sum(abs(readme2Results$point_readme_2[names(unlabeled_pd)] - unlabeled_pd)),
                               error_readme2_3 = sum(abs(readme2Results$point_readme_3[names(unlabeled_pd)] - unlabeled_pd)))
         elapsed_time = proc.time()[3]-outer_start_time

        if(is.null(readme2Results$PrDDiv)){
          readme2Results$PrDDiv <- readme2Results$PrDDiv_withMatching <- NA 
          readme2Results$ESGivenDDiv_withMatching <- readme2Results$ESGivenDDiv <- NA
        }
        
        results = as.data.frame(data.table::fread(sprintf("./%s/%s", filename_saved_results, out_file) ))[,-1] 
        results[it,"PrDDiv"] <- sum(abs(labeled_pd[names(unlabeled_pd)] - unlabeled_pd))
        results[it,"logbits_inmemory2"] <- log(mem_at_iter)
        results[it,"elapsed_time"] <- elapsed_time
        results[it,"readme2_error"] <- readme2Results$error_readme2
        if(is.na( results[it,"readme2_error"])){browser()}
        for(aj in 1:3){ eval(parse(text = sprintf("results$readme2_%s_error[it] <- readme2Results$error_readme2_%s",aj,aj))) } 
        results[it,"PrDDiv_withMatching"] <- readme2Results$PrDDiv_withMatching
        results[it,'ESGivenDDiv_withMatching'] <- readme2Results$ESGivenDDiv_withMatching
        results[it,"ESGivenDDiv"] <- readme2Results$ESGivenDDiv
        write.csv(results,sprintf("./%s/%s", filename_saved_results, out_file) )
        rm(results,readme2Results,unlabeled_pd)
    }
  } 
}
}
tf$keras$backend$clear_session()

#analyze the data 
if(T == T){
  options("scipen"=4, "digits"=4)
  par(mfrow = c(1,1))
  
  f2n <- function(.){as.numeric(as.character(.))}
  
  my_data <- c()
  my_csvs <- list.files(sprintf("./%s/", filename_saved_results))
  for(my_csv in my_csvs){df_i <- try(read.csv(sprintf("./%s/%s", filename_saved_results, my_csv  ))[,-1], T) ;my_data <- rbind(my_data, df_i) }
  my_data[,!colnames(my_data) %in% c("sampling_scheme", "dataset")] <- apply(my_data[,!colnames(my_data) %in% c("sampling_scheme", "dataset")], 2, f2n)

  #give full names to sampling schemes
  if(T == T){ 
    my_data$sampling_scheme_fullName = as.character( my_data$sampling_scheme ) 
    my_data$sampling_scheme_fullName[my_data$sampling_scheme == "Historical"] <- "*Empirical*"
    my_data$sampling_scheme_fullName[my_data$sampling_scheme == "HistoricalVarySampSize"] <- "Empirical, Varied L. Size"
    my_data$sampling_scheme_fullName[my_data$sampling_scheme == "Sequential"] <- "Sequential"
    my_data$sampling_scheme_fullName[my_data$sampling_scheme == "HistoricalMaxL"] <- "Empirical, Max. L. Size"
    my_data$sampling_scheme_fullName[my_data$sampling_scheme == "HistoricalMaxU"] <- "Empirical, Max. U. Size"
    my_data$sampling_scheme_fullName[my_data$sampling_scheme == "JustPermutations"] <- "Random Subsets"
    my_data$sampling_scheme_fullName[my_data$sampling_scheme == "MinDiv"] <- "Min. P.D."
    my_data$sampling_scheme_fullName[my_data$sampling_scheme == "Ahistorical_NoReuse"] <- "Unif. Random P.D."
    my_data$sampling_scheme_fullName[my_data$sampling_scheme == "MaxDiv"] <-  "Max P.D."
    my_data$sampling_scheme_fullName[my_data$sampling_scheme == "Min_XDiv"] <- "Min. S.C."
    my_data$sampling_scheme_fullName[my_data$sampling_scheme == "Uniform_XDiv"] <- "Random S.C."
    my_data$sampling_scheme_fullName[my_data$sampling_scheme == "Max_XDiv"] <- "Max. S.C."
    my_data$sampling_scheme_fullName[my_data$sampling_scheme == "Ahistorical_aykut"] <- "Random Walk"
    my_data$sampling_scheme_fullName[my_data$sampling_scheme == "Ahistorical_aykut_quantification"] <- "Chron. L., Unif. U."
    my_data$sampling_scheme_fullName[my_data$sampling_scheme == "firat2018"] <- "Random L., Unif. U."
    my_data$sampling_scheme_fullName[my_data$sampling_scheme == "breakdown_sampling"] <- "Extr. Shift"
    my_data$sampling_scheme_fullName[my_data$sampling_scheme == "Ahistorical_NoReuse2"] <- "Unif. Random Props."
    my_data$sampling_scheme_fullName[my_data$sampling_scheme == "breakdown_sampling2"] <- "Extr. Features in L. Set"  
  }
  
  pdf("./BIGSIM_readme.pdf");
  set.seed(1)
  if(T == T){ 
    yTag <- "readme"; ylab_use <- "Improvement in SAE, Readme2 vs. Readme"
    X_axis <- tapply(my_data$PrDDiv, my_data$sampling_scheme_fullName, function(x) mean(x,na.rm = T));
    XLab <- "Average Proportion Divergence"
    if(yTag == "readme"){ Y_axis <- -1*tapply((my_data$readme2_error-my_data$readme_error), my_data$sampling_scheme_fullName, function(x){mean(x,na.rm = T)})  }
    pd_by_design = tapply(my_data$PrDDiv, my_data$sampling_scheme_fullName, function(x){mean(x,na.rm = T)})
    if(T == T){ 
      my_cols <-  rep("blue", times = length(X_axis));my_cols[names(X_axis) == "*Empirical*"] <- "red"
      my_cols2 <-  rep("blue", times = length(X_axis));my_cols2[names(X_axis) == "*Empirical*"] <- "red"
      my_cex <- rep(0.65, times = length(X_axis))
      my_cex[names(X_axis) == "*Empirical*"] <- 1.25
      par(mar=c(5, 5, 4, 2) )
      xaxt_use <- c(min(X_axis)-0.55*(max(X_axis)-min(X_axis)), 
                max(X_axis)+2*(max(X_axis)-min(X_axis)))
      plot(X_axis,Y_axis, 
           ylim = c(min(0-0.01,Y_axis),  max(Y_axis)), 
           xlim = xaxt_use, 
           cex.lab = 2, 
           xaxt = "n", 
           col = my_cols, 
           xlab = XLab, ylab = ylab_use, 
           cex = 0); 
      my_letters = letters[(1:length(X_axis)) ][order(pd_by_design)]
      text(X_axis,Y_axis, labels = my_letters, col=my_cols,cex=1.5 ) 
      xaxt_seq <- seq(min(X_axis),  max(X_axis), length.out = 5)
      axis(side = 1,  at = xaxt_seq, labels = round(xaxt_seq, 2))
      abline(h = 0, lty = 3)
      text((xaxt_use[1]+xaxt_use[2])/2-0.1, 0.01, labels = "higher error than readme2", col = "black",cex=1.4)
      text((xaxt_use[1]+xaxt_use[2])/2-0.1, -0.01, labels = "lower error than readme2", col = "black",cex=1.4)
      yjig <- rep(0, length(X_axis))
      xjig <- 0.06 *  (log(nchar(names(X_axis)))) + 0.003*nchar(names(X_axis))
      alternate_ = sample(c(T,F),length(Y_axis), replace = T)
      xjig[alternate_] <- -1*xjig[alternate_]
      X_axt_jigged <- X_axis+xjig
      Y_axt_jigged <- Y_axis+yjig; 
      legend("topright", 
             pch = sort(my_letters),
             legend = names(X_axis)[order(my_letters,decreasing =F)],
             col=my_cols2[order(my_letters,decreasing=F)],
             cex=1.10)
    } 
  }
  dev.off()
  
  pdf("./BIGSIM_all.pdf");
  if(T == T){ 
    AggFxn <- function(.){median(., na.rm = T)}
    set.seed(1)
    par(mar=c(16, 5, 4, 2) )
    Y_axis_mat <- c()
    y_names <- c(grep(colnames(my_data), pattern = "readme", value = T), 
                 grep(colnames(my_data), pattern = "va2",value = T),
                 grep(colnames(my_data), pattern = "quantify", value = T),
                 grep(colnames(my_data), pattern = "discrete", value = T), 
                 grep(colnames(my_data), pattern = "continuous", value = T))
    plot(0,0, 
         ylim = c(-0.20, 0.50),
         xlim = c(1,length(unique(my_data$sampling_scheme))), 
         cex.lab = 2, xaxt = "n", xlab = "",
         ylab = "Improvement in SAE", 
         cex = 0, pch = 3 ); 
    abline(h = 0, lty = 3)
    text(x=8,y=-0.10,labels = "lower error than readme2")
    text(x=8,y=0.40,labels = "higher error than readme2")
    counter_iter <- 0 
    overall_value <- rep(0, times = length(y_names))
    names(overall_value) <- y_names
    y_names = y_names[y_names!="readme_error"]; y_names <- c(y_names,"readme_error")
    for(outer_y in y_names){ 
    Y_axis <- -1*tapply((my_data$readme2_error-my_data[,outer_y]), my_data$sampling_scheme_fullName, AggFxn) 
    X_axis <- rank(tapply(my_data$PrDDiv, my_data$sampling_scheme_fullName, mean)); 
    X_axis_names = names(X_axis) 
    X_axis_pd <- tapply(my_data$PrDDiv, my_data$sampling_scheme_fullName, mean); 
    #X_axis_names = sapply(1:length(X_axis), function(da){ sprintf("%s (%s)", X_axis_names[da], round(X_axis_pd[da],2))})
    my_col <- rep("black", length(X_axis))
    my_col[names(X_axis) == "*Empirical*"] <- "red"
    if(outer_y == "readme_error"){my_col <- rep("blue", length(X_axis))}
    if(outer_y != "readme2"){ 
      counter_iter <- counter_iter + 1 
      xaxt <- c(min(X_axis)-0.55*(max(X_axis)-min(X_axis)), 
                max(X_axis)+0.55*(max(X_axis)-min(X_axis)))
      X_axis <- X_axis + runif(length(X_axis), -0.02, 0.2)
      if(counter_iter == 1){
        cex_vec = as.numeric(gtools::quantcut(-nchar(names(X_axis)),q=4))
        cex_vec[cex_vec==1] <- 1.3
        cex_vec[cex_vec>1] <- 1.5
        text(X_axis, -0.25, srt = 90, adj = 1,
             labels = X_axis_names, xpd = TRUE, col = my_col,cex=cex_vec)
        text(length(X_axis), -1.34, srt = 0, adj = 1,cex = 1.5, 
             labels =  "Sim. Design (Ordered by Average Proportion Div.)", xpd = TRUE)
      } 
      points(X_axis,Y_axis,# labels = my_label, 
           cex.lab = 1.5, 
           col = my_col, 
           cex = 0.50, pch = 1)#3 ); 
    } 
      overall_value[counter_iter] <- AggFxn(Y_axis)
      Y_axis_mat <- rbind(Y_axis_mat , Y_axis)
    } 
    points(X_axis, apply(Y_axis_mat,2,function(eraaa){median(eraaa, na.rm = T)}), pch = "-----",cex = 2, col = "darkgray")
    row.names(Y_axis_mat) <- y_names
    winner_indices <- unlist( apply(Y_axis_mat, 2, function(x){
      return(  which.min(x) )  
      }))
  
    top_ranks = (  sort(rowMeans(apply(Y_axis_mat, 2, rank)))  )
    print( head(  sort(rowMeans(apply(Y_axis_mat, 2, rank)))  )  )  
    print(  head(sort( prop.table(table(  row.names(Y_axis_mat)[winner_indices] ) ) , 
            decreasing = T)))
    temp_ = head(sort( prop.table(table(  row.names(Y_axis_mat)[winner_indices] ) ) , 
               decreasing = T))
    } 
  dev.off()

  pdf("./FULL_prop_bysubsample.pdf")
  my_q <- 10  
  if(T == T){ 
    set.seed(2)
    baseline_name <- "readme2_error"
    axis_vec <- c(my_data$PrDDiv)
    competitor_names <- c("continuousRegression_error","continuousEnsemble_error", "continuousBayes_error", 
                          "continuousSVM_error", "continuousForest_error","continuousRegression_error_count", 
                          "continuousEnsemble_error_count", "continuousBayes_error_count", "continuousSVM_error_count", 
                          "continuousForest_error_count", 
                          "discreteEnsemble_error", "discreteBayes_error", "discreteForest_error", 
                          "discreteRegression_error", "discreteSVM_error", "discreteEnsemble_error_count", 
                          "discreteBayes_error_count", "discreteForest_error_count", "discreteRegression_error_count",
                          "discreteSVM_error_count", 
                           
                          "quantify_adjCount_error", "quantify_friedman_error", "quantify_hdx_error", 
                          "quantify_medianSweep_error", "quantify_mixtureHPMF_error", "quantify_mixtureL1_error", 
                          "quantify_mixtureL2_error", "quantify_prob_error", 
                          
                          "readme_error", "va2_error", 
                          
                          "PrDDiv")
    EstimateTypeKey <- SpaceKey <- rep("", times = length(competitor_names))
    SpaceKey[grepl(competitor_names, pattern = "continuous")] <- "Cont."
    SpaceKey[!grepl(competitor_names, pattern = "continuous")] <- "Discrete"
    EstimateTypeKey[grepl(competitor_names, pattern = "quantify")] <- "Quant."
    EstimateTypeKey[grepl(competitor_names, pattern = "va2")] <- "Quant."
    EstimateTypeKey[grepl(competitor_names, pattern = "readme_error")] <- "Quant."
    EstimateTypeKey[grepl(competitor_names, pattern = "count")] <- "CountClass"
    EstimateTypeKey[grepl(competitor_names, pattern = "PrDDiv")] <- "Proportion Div."
    EstimateTypeKey[EstimateTypeKey==""] <- "AveProbs"
    
    TypeKey <- paste(SpaceKey, EstimateTypeKey )
    
    for(asdf in 1:length(competitor_names)){ 
      eval(parse(text = sprintf("base_error_vec%s <- c(my_data$%s)", 
                                asdf, competitor_names[asdf]) ) )
    }
    alt_error_vec <- c(my_data[,baseline_name])
    
    par(mfrow = c(1,1))
    library(gtools)
    
    axis_vec_orig <- axis_vec
    axis_vec <- c(gtools::quantcut(axis_vec, q = my_q))
    names(axis_vec) <- names(axis_vec_orig)
    NComparisons <- length(competitor_names)
    
    for(iter_i in 1:NComparisons){ 
      eval(parse(text  = sprintf("v%s <- (alt_error_vec <= base_error_vec%s)", iter_i, iter_i)))
      eval(parse(text = sprintf("v%s <-  tapply(v%s, axis_vec, function(x){mean(x,na.rm = T)})", iter_i,iter_i)))
    } 
  
    if(T == T){ 
      par(mar=c(5, 5, 4, 2) )
      plot(names(v1),  v1, 
           pch = "A",
           ylim = c(0, 1.05),
           cex.lab = 2, 
           xaxt = "n", 
           main = "", 
           cex=0.5,
           cex.main = 2, 
           col = "darkgray", 
           ylab = c("Proportion of Datasets with Higher SAE"), 
           xlab = "", # "Proportion Divergence (Quantiles)", 
           type = "b")
      axis(1,names(v1), names(v1))
      abline(h = 0.5, lwd = 3, lty = 2, col = "black")
      text(length(v1) - 2.5, 0.54, label = "higher error than readme2",cex=1.5)
      text(length(v1) - 2.5, 0.46, label = "lower error than readme2",cex=1.5)
      
      myDF <- eval(parse(text =  paste("data.frame(", paste(sprintf("v%s = v%s", 1:NComparisons, 1:NComparisons), 
                                                            collapse = ","), ")")))
      best_vec <- c()
      for(i in 1:ncol(myDF)){
        accept_x_locals <- c(1,2,3)
        target <- myDF[accept_x_locals,i] 
        error_mat <- matrix(NA, nrow = length(accept_x_locals), ncol = ncol(myDF))
        for(j in (1:ncol(myDF))[!1:ncol(myDF) %in% i] ){ 
          error_mat[,j] <- abs(myDF[accept_x_locals,j] - target)
        }
      }
      
      #colors and shapes and such 
      col_unique = (rainbow(4)) 
      pch_vec <- col_vec <- rep("", times = length( EstimateTypeKey )  )
      col_vec[EstimateTypeKey==unique(EstimateTypeKey)[1]] <- col_unique[1]
      col_vec[EstimateTypeKey==unique(EstimateTypeKey)[2]] <- col_unique[2]
      col_vec[EstimateTypeKey==unique(EstimateTypeKey)[3]] <- col_unique[3]
      col_vec[EstimateTypeKey==unique(EstimateTypeKey)[4]] <- col_unique[4]
      pch_vec[EstimateTypeKey==unique(EstimateTypeKey)[1]] <- "A"
      pch_vec[EstimateTypeKey==unique(EstimateTypeKey)[2]] <- "C"
      pch_vec[EstimateTypeKey==unique(EstimateTypeKey)[3]] <- "Q"
      pch_vec[EstimateTypeKey==unique(EstimateTypeKey)[4]] <- "B"
      lty_vec = SpaceKey
      lty_vec[lty_vec=="Cont."] <- 1
      lty_vec[lty_vec=="Discrete"] <- 3
      lty_vec[EstimateTypeKey == "Proportion Div."] <- 1
      lty_vec <- f2n(lty_vec)
      
      #add them 
      for(iter_i in 1:NComparisons){ 
        if(eval(parse(text = sprintf("!all(is.na(v%s))", iter_i)))){ 
          eval(parse(text = sprintf("v%s[which(v%s == 1)] <-v%s[which(v%s == 1)] +  rnorm(length(v%s[which(v%s == 1)]), sd = 0.01)",  iter_i, iter_i, iter_i, iter_i, iter_i, iter_i)))
          eval(parse(text = sprintf("points( f2n(names(v%s))+rnorm(length( names(v%s)), sd = 0.01), v%s, 
                                    col = col_vec[iter_i],lty = lty_vec[iter_i],
                                    type = 'b',pch=pch_vec[iter_i],cex=0.5)", iter_i, iter_i, iter_i)))
        }
      }
      
      legend("bottomright",legend = c("Ave Probs (C)", "Count+Classify (C)", "Ave Probs (D)", "Count+Classify (D)", 
                        "Quantification (D)", "Baseline Pr(D) Divergence"), col = c(col_unique[1],
                                                       col_unique[2],
                                                       col_unique[1],
                                                       col_unique[2],
                                                       col_unique[3],
                                                       col_unique[4]),
             pch=c("A", "C", "A", "C", "Q","B"),lty = c(1,1,2,2,2), bty = "n",cex=1.25)
    }
  } 
  dev.off() 

  #Arrow plots
  AggFxn <- function( x){mean(x, na.rm = T)}
  arrow_fxn <- function(alt_error_vec, base_error_vec, 
                          xlab = NULL, 
                          x_data = NULL){
      if(is.null(xlab)){xlab <- "Ordered by Improvement"}
      
      if(is.null(x_data)){order_by <-  -((alt_error_vec - base_error_vec)) } 
      if(!is.null(x_data)){order_by <-  x_data } 
      lty_vec = rep(1, times = length( alt_error_vec ) ) 
      lty_vec[base_error_vec < alt_error_vec] <- 3
      par(mar=c(5, 5, 4, 2) ) 
      base_error_vec <- base_error_vec[order(order_by)]
      alt_error_vec <- alt_error_vec[order(order_by)]
      lty_vec <- lty_vec[order(order_by)]
      plot(0, 
           cex = 0, 
           cex.main = 1, 
           cex.lab = 1.75, 
           xaxt = "n", 
           xlim = c(1, length(base_error_vec)), 
           ylim = c(min(c(base_error_vec, alt_error_vec)), max(c(base_error_vec, alt_error_vec))), 
           #ylim = c(0, max(alt_error_vec, base_error_vec)+0), 
           ylab  = "Sum of Absolute Errors (SAE)", 
           xlab = xlab)
      col_vec <- rep("black", times = length( base_error_vec)) 
      clinton_col = "pink"; enron_col = "red"; immigration_col = "cornsilk3"; stanford_col = "purple"
      col_vec[grepl(names(alt_error_vec), pattern = "enron", ignore.case = T)] <- enron_col
      col_vec[grepl(names(alt_error_vec), pattern = "clinton", ignore.case = T)] <- clinton_col
      col_vec[grepl(names(alt_error_vec), pattern = "immigration", ignore.case = T)] <- immigration_col
      col_vec[grepl(names(alt_error_vec), pattern = "stanford", ignore.case = T)] <- stanford_col
      lwd_vec <- rep(1,length = length( base_error_vec ))
      lwd_vec[col_vec!="black"] <- 2
      arrows(
        x0 = 1:length(alt_error_vec), y0 = base_error_vec,
        x1 = 1:length(alt_error_vec), y1 = alt_error_vec, 
        length = 0.10,
        lwd = lwd_vec,
        #length = 0.1, 
        col = col_vec, 
        lty = lty_vec)
      try(text(x = which(grepl(names(alt_error_vec), pattern = "immigration", ignore.case = T)),  
               y = base_error_vec[grepl(names(alt_error_vec), pattern = "immigration", ignore.case = T)]+0.08, 
               cex = 1.4, 
               label = "Immigration", col = immigration_col, srt = 90), T)
      try(text(x = which(grepl(names(alt_error_vec), pattern = "clinton", ignore.case = T)),  
               y = base_error_vec[grepl(names(alt_error_vec), pattern = "clinton", ignore.case = T)]+0.05, 
               cex = 1.5, 
               label = "Clinton", col = clinton_col, srt = 90), T)
      try(text(x = which(grepl(names(alt_error_vec), pattern = "enron", ignore.case = T)),  
               y = base_error_vec[grepl(names(alt_error_vec), pattern = "enron", ignore.case = T)]+0.04, 
               cex = 1.5, 
               label = "Enron", col = enron_col, srt = 90), T)
      try(text(x = which(grepl(names(alt_error_vec), pattern = "stanford", ignore.case = T)),  
               y = base_error_vec[grepl(names(alt_error_vec), pattern = "stanford", ignore.case = T)]-0.13, 
               cex = 1.5, 
               label = "Stanford", col = stanford_col, srt = 90), T)
    }
  #my_data = my_data[my_data$sampling_scheme=="Historical",]
  if(T == T){ 
    #ARROW 1
    base_error_vec11 <- tapply(my_data$readme_error, as.character(my_data$dataset), function(x) AggFxn(x))
    alt_error_vec <- tapply(my_data$readme2_error, as.character(my_data$dataset), function(x) AggFxn(x))

    pdf("./FArrow.pdf")
    arrow_fxn(alt_error_vec = alt_error_vec, 
              base_error_vec = base_error_vec11, 
              xlab = NULL, 
              x_data = NULL)
    dev.off()
  }
  
 }